Main simulation
generate_cases = function(effect, freq, N, bias = 0, noise_xs = 0, contam=5){
X_min = -5
X_max = -X_min
if(contam > X_max | contam < X_min) stop('contam has to be between X_max and X_min')
# latent variable X
xs = sort(runif(N, min = X_min, max = X_max)) + (X_max-contam)
# transform to get normally distributed RV Y - plus noise in ranking
ys = qnorm(rank(xs + rnorm(N, mean = 0, sd = noise_xs))/(N+1))
p = inv.logit(xs) - bias
effect_xs = effect * p
freqs = freq * (1-effect_xs)
g = rbinom(n = N, size = 1, prob = freqs)
return(list(g, ys, freqs, xs))
}
# example
N = 2000
out = generate_cases(effect = 0.9, freq = 0.06, N = N, bias = 0, noise_xs = 0, contam = 4)
summary(glm(out[[1]] ~ out[[2]], family='binomial'))
##
## Call:
## glm(formula = out[[1]] ~ out[[2]], family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4914 -0.2265 -0.1844 -0.1500 3.3242
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0384 0.1827 -22.103 < 2e-16 ***
## out[[2]] -0.6033 0.1613 -3.741 0.000183 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 399.91 on 1999 degrees of freedom
## Residual deviance: 385.65 on 1998 degrees of freedom
## AIC: 389.65
##
## Number of Fisher Scoring iterations: 7
mv_avg = Np_fit(x = out[[1]], y = out[[2]], BBs = 100, lambda = 1)
plot((1:N)/N, colMeans(mv_avg), type='l')

plot(out[[4]], out[[3]], type='l')

Simulations
Sim parameters
Ns = round(10^(seq(2,4,length.out = 20)))
effect = .5
freq = .1
BBs = 10000
RUN_SIMS = F
Bias means that the allele frequency is in fact increased in individuals who are contaminants (not correct phenotype). Noise is added to the latent variable xs to get the ys
no noise & no bias
if(RUN_SIMS){
tic()
sim_out_all = foreach(n = Ns) %dopar% {
sim_out = array(dim = c(BBs, 2))
for(b in 1:BBs){
controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
cases = generate_cases(effect=effect, freq=freq, N=n, bias = 0,noise_xs = 0, contam = 4)
out = c(rep(0,n), rep(1,n))
sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]),
g = c(controls, cases[[1]]),
case_ind = out==1)
}
sim_out
}
toc()
save(sim_out_all, file = 'temp.RData')
} else {
load(file = 'temp.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all, function(x) mean(x[,1] < 0.001 )),
xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA)
lines(log10(Ns), lapply(sim_out_all, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

Same simulation but adding noise to xs
if(RUN_SIMS){
tic()
sim_out_all2 = foreach(n = Ns) %dopar% {
sim_out = array(dim = c(BBs, 2))
for(b in 1:BBs){
controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
cases = generate_cases(effect=effect, freq=freq, N=n, bias = 0, noise_xs = .5, contam = 4)
out = c(rep(0,n), rep(1,n))
sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]),
g = c(controls, cases[[1]]),
case_ind = out==1)
}
sim_out
}
toc()
save(sim_out_all2, file = 'temp2.RData')
} else {
load(file = 'temp2.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,1] < 0.001 )),
xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA)
lines(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

Same simulation as 1 but adding bias
if(RUN_SIMS){
tic()
sim_out_all3 = foreach(n = Ns) %dopar% {
sim_out = array(dim = c(BBs, 2))
for(b in 1:BBs){
controls = sample(0:1, size = n, replace = T, prob = c(1-freq, freq))
cases = generate_cases(effect=effect, freq=freq, N=n, bias = 1/3, noise_xs = 0, contam = 4)
out = c(rep(0,n), rep(1,n))
sim_out[b,] = holmes_test_simple(y = c(rep(0,n), cases[[2]]),
g = c(controls, cases[[1]]),
case_ind = out==1)
}
sim_out
}
toc()
save(sim_out_all3, file = 'temp3.RData')
} else {
load(file = 'temp3.RData')
}
par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3)
plot(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,1] < 0.001 )),
xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
xlab='Sample size', ylab = 'Power for alpha = 0.001')
axis(1, at = 2:4, labels = 10^(2:4))
axis(1, at = log10(seq(100, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA)
lines(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)

par(las = 1, bty='n', family='serif', cex.axis=1.3, cex.lab=1.3, mfrow=c(2,2))
my_cases = generate_cases(effect=effect, freq=freq, N=10^6, bias = 0,
noise_xs = .5, contam = 4)
plot(my_cases[[4]], 100*my_cases[[3]], type='l', xlab='Latent variable',
ylab='Frequency of marker (%)')
abline(h = freq*100, lty=2)
mtext(text = 'A', side = 3, line = 1, adj = 0)
plot(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,1] < 0.001 )),
xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
xlab='Sample size', ylab = 'Power for alpha = 0.001', xlim=c(log10(300), 4))
xvals = c(300, 1000,3000,10000)
axis(1, at = log10(xvals), labels = xvals)
axis(1, at = log10(seq(300, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA)
abline(h = 0.5, lty=2)
lines(log10(Ns), lapply(sim_out_all2, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)
mtext(text = 'B', side = 3, line = 1, adj = 0)
my_cases = generate_cases(effect=effect, freq=freq, N=10^6,
bias = 1/3, noise_xs = 0, contam = 4)
plot(my_cases[[4]], 100*my_cases[[3]], type='l', xlab='Latent variable',
ylab='Frequency of marker (%)')
abline(h = freq*100, lty=2)
mtext(text = 'C', side = 3, line = 1, adj = 0)
plot(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,1] < 0.001 )),
xaxt = 'n', type='l', ylim=c(0,1), panel.first=grid(),lwd=3,
xlab='Sample size', ylab = 'Power for alpha = 0.001', xlim=c(log10(300), 4))
axis(1, at = log10(xvals), labels = xvals)
axis(1, at = log10(seq(300, 1000, by = 100)), labels = NA)
axis(1, at = log10(seq(1000, 10000, by = 1000)), labels = NA)
abline(h = 0.5, lty=2)
lines(log10(Ns), lapply(sim_out_all3, function(x) mean(x[,2] < 0.001 )), lwd=3, col='red')
legend('topleft', col=c('black','red'), legend = c('M1 versus M0', 'M2 versus M0'), lwd=3, inset=0.03)
mtext(text = 'D', side = 3, line = 1, adj = 0)
